package com.jms.crawler.util;

/**
 * Text clean-up for crawled post content.
 *
 * <p>{@link #preprocess(String)} removes markup and noise (HTML tags, URLs,
 * {@code @mentions}, {@code #topic#} markers, emoji and symbol characters, line
 * breaks, and the literal repost marker) and returns the remaining text.
 *
 * <p>All regular expressions are compiled once at class-load time;
 * {@link java.util.regex.Pattern} instances are immutable and safe to share.
 *
 * @author liu_jshan
 * @version 1.0 20240510
 */
public class DataPreprocess {

    /** Any code point above U+FFFF, i.e. supplementary characters such as most emoji. */
    private static final java.util.regex.Pattern NON_BMP =
            java.util.regex.Pattern.compile("[^\\u0000-\\uFFFF]");

    /** Line breaks; carriage returns are included so CRLF input leaves no stray {@code \r}. */
    private static final java.util.regex.Pattern LINE_BREAK =
            java.util.regex.Pattern.compile("[\\r\\n]");

    /** Anything that looks like an HTML tag: {@code <...>}. */
    private static final java.util.regex.Pattern HTML_TAG =
            java.util.regex.Pattern.compile("<([^>]*)>");

    /** http / https / ftp hyperlinks. */
    private static final java.util.regex.Pattern URL =
            java.util.regex.Pattern.compile(
                    "(http|ftp|https)://[\\w\\-_]+(\\.[\\w\\-_]+)+([\\w\\-.,@?^=%&:/~+#]*[\\w\\-@?^=%&/~+#])?");

    /** {@code @name} mentions, with an optional leading "回复" (reply) and optional trailing colon. */
    private static final java.util.regex.Pattern MENTION =
            java.util.regex.Pattern.compile("(回复)?@[\\u4e00-\\u9fa5a-zA-Z0-9_-]{0,15}:?");

    /** Topic markers of the form {@code #topic#}. */
    private static final java.util.regex.Pattern TOPIC =
            java.util.regex.Pattern.compile("#[^#]+#");

    /**
     * Characters in U+2600..U+27BF and U+2000..U+200F.
     * The two ranges are listed back to back: inside a character class a '|' would be a
     * literal pipe (not alternation) and would cause every '|' in the input to be removed.
     */
    private static final java.util.regex.Pattern SPECIAL_CHARS =
            java.util.regex.Pattern.compile("[\\u2600-\\u27BF\\u2000-\\u200F]");

    /** Literal marker text ("repost Weibo") that is removed verbatim. */
    private static final String REPOST_MARKER = "转发微博";

    /**
     * Cleans a piece of crawled text.
     *
     * <p>Removal steps run in a fixed order: supplementary-plane characters, line breaks,
     * HTML tags, URLs, mentions, topics, special symbol characters, and finally the
     * repost marker. The result is trimmed of leading and trailing whitespace.
     *
     * @param str the raw text; may be {@code null}
     * @return the cleaned, trimmed text; an empty string when {@code str} is {@code null}
     */
    public static String preprocess(String str) {
        if (str == null) {
            return "";
        }
        String cleaned = str;
        cleaned = NON_BMP.matcher(cleaned).replaceAll("");
        cleaned = LINE_BREAK.matcher(cleaned).replaceAll("");
        // Tags are stripped before URLs so that a link wrapped in markup is exposed as plain text.
        cleaned = HTML_TAG.matcher(cleaned).replaceAll("");
        // URLs are stripped before topics so a '#fragment' in a link is not paired with a later '#'.
        cleaned = URL.matcher(cleaned).replaceAll("");
        cleaned = MENTION.matcher(cleaned).replaceAll("");
        cleaned = TOPIC.matcher(cleaned).replaceAll("");
        cleaned = SPECIAL_CHARS.matcher(cleaned).replaceAll("");
        cleaned = cleaned.replace(REPOST_MARKER, "");
        return cleaned.trim();
    }

}
